R Markdown

This is an initial test report that analyzes the causality between variables of development projects (GitHub) and quality characteristics of the software (SonarCloud). The data has been obtained through the public APIs of both platforms, and the JSON data has been filtered and pre-processed using an intermediate MongoDB document database.

The result of the preprocessing has been stored in a CSV file. The first step is to import the data from that file.

library(readr)
# Load the pre-processed export (semicolon-delimited CSV).
# NOTE(review): the import reports rows holding more fields than the header
# declares; inspect problems(sonar_git) before relying on the affected rows.
sonar_git <- read_delim(
  file = "../data/sonar-git.csv",
  delim = ";",
  quote = "\\\"",
  escape_double = FALSE,
  locale = locale(),
  trim_ws = TRUE
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   project = col_character(),
##   version = col_character(),
##   from = col_datetime(format = ""),
##   to = col_datetime(format = ""),
##   file_complexity_distribution = col_character(),
##   files = col_character(),
##   function_complexity_distribution = col_character(),
##   functions = col_character(),
##   generated_lines = col_character(),
##   generated_ncloc = col_character(),
##   info_violations = col_character(),
##   line_coverage = col_character(),
##   new_line_coverage = col_character(),
##   lines = col_character(),
##   ncloc = col_character(),
##   lines_to_cover = col_character(),
##   new_lines_to_cover = col_character(),
##   sqale_rating = col_character(),
##   alert_status = col_character(),
##   security_hotspots = col_character()
##   # ... with 1 more columns
## )
## See spec(...) for full column specifications.
## Warning: 113 parsing failures.
## row col    expected      actual                    file
##   6  -- 111 columns 123 columns '../data/sonar-git.csv'
##   7  -- 111 columns 123 columns '../data/sonar-git.csv'
##   8  -- 111 columns 123 columns '../data/sonar-git.csv'
##   9  -- 111 columns 117 columns '../data/sonar-git.csv'
##  10  -- 111 columns 117 columns '../data/sonar-git.csv'
## ... ... ........... ........... .......................
## See problems(...) for more details.

Then, we filter the matrix (with 107 variables) to keep those of interest (after some preliminary analyses were done). With the filtered data, we show the descriptive statistics.

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep the two identifier columns plus the ten activity/quality measures
# used in the rest of the report.
dataset1 <- sonar_git %>%
  select(
    project, version,
    commits, changes_by_commit, committers, committers_weight,
    bugs, code_smells, complexity, violations,
    duplicated_blocks, open_issues
  )

# Descriptive statistics of the retained columns.
summary(dataset1)
##    project            version             commits       changes_by_commit 
##  Length:224         Length:224         Min.   :  0.00   Min.   :    0.00  
##  Class :character   Class :character   1st Qu.:  6.00   1st Qu.:   43.18  
##  Mode  :character   Mode  :character   Median : 29.00   Median :  201.02  
##                                        Mean   : 76.59   Mean   :  954.96  
##                                        3rd Qu.: 73.50   3rd Qu.:  458.93  
##                                        Max.   :740.00   Max.   :34902.00  
##    committers     committers_weight      bugs         code_smells   
##  Min.   : 0.000   Min.   :0.00000   Min.   :  0.00   Min.   :    0  
##  1st Qu.: 1.000   1st Qu.:0.01688   1st Qu.:  0.00   1st Qu.:  109  
##  Median : 4.000   Median :0.06865   Median :  1.00   Median :  151  
##  Mean   : 5.161   Mean   :0.22962   Mean   : 41.88   Mean   : 1907  
##  3rd Qu.: 6.000   3rd Qu.:0.23140   3rd Qu.: 31.00   3rd Qu.: 1081  
##  Max.   :32.000   Max.   :1.00000   Max.   :923.00   Max.   :40618  
##    complexity       violations      duplicated_blocks  open_issues   
##  Min.   :     0   Min.   :    0.0   Min.   :   0.00   Min.   :    0  
##  1st Qu.:  2420   1st Qu.:  185.0   1st Qu.:  18.75   1st Qu.:    2  
##  Median :  5770   Median :  590.5   Median :  52.00   Median :  133  
##  Mean   : 13295   Mean   : 2058.6   Mean   : 130.95   Mean   : 1740  
##  3rd Qu.: 12033   3rd Qu.:  892.0   3rd Qu.:  86.00   3rd Qu.:  630  
##  Max.   :143551   Max.   :42591.0   Max.   :1931.00   Max.   :42590

Including Plots

First we analyse commits/committers relationship

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Box plots of the number of commits for each project.
p <- ggplot(dataset1, aes(x = commits, y = project)) +
  geom_boxplot(fill = "gray")
p

# Box plots of the number of committers for each project.
p <- ggplot(dataset1, aes(x = committers, y = project)) +
  geom_boxplot(fill = "gray")
p

# Commits vs. committers, coloured by project, with a filled 2D density
# overlay shaded from green (low level) to red (high level).
sp <- ggplot(dataset1, aes(x = commits, y = committers)) +
  geom_point(aes(colour = project)) +
  stat_density_2d(aes(fill = ..level..), geom = "polygon", alpha = 0.2) +
  scale_fill_gradient(low = "green", high = "red")

sp + theme_classic()

# Same plot restricted to the window 0-120 commits and 0-10 committers.
zoom_sp <- sp + coord_cartesian(xlim = c(0, 120), ylim = c(0, 10))
zoom_sp + theme_classic()

# 2D kernel density estimate of committers (x) against commits (y) on a
# 50 x 50 grid.
kd <- with(dataset1, MASS::kde2d(committers, commits, n = 50))
# kde2d() returns z with rows indexed by x and columns indexed by y, whereas
# a plotly surface reads the rows of z along y and the columns along x.
# Transposing z keeps the surface aligned with the x/y axes passed here.
fig <- plot_ly(x = kd$x, y = kd$y, z = t(kd$z)) %>% add_surface()

fig

As preliminary analysis, we compute correlation values and draw a matrix of scatter plots:

# Numeric measures only: drop the two identifier columns.
dataset_only_data <- select(dataset1, -project, -version)
# Pairwise correlation matrix, reused by the corrplot calls.
M <- cor(dataset_only_data)
# Scatter-plot matrix of every pair of measures.
plot(dataset_only_data)

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
# Correlogram in the original column order: shaded cells in the lower
# triangle, pie glyphs in the upper triangle.
corrgram(
  dataset_only_data,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "correlation between variables"
)

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
# The same correlation matrix drawn with three different glyph styles.
for (glyph in c("circle", "ellipse", "number")) {
  corrplot(M, method = glyph)
}

# Palette generator running from red through white to green.
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#AADD77", "#77AA44"))
# Pairwise significance tests; the p-values are used to blank cells below.
res1 <- cor.mtest(dataset_only_data, conf.level = .95)
corrplot(
  M,
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = .8,
  # Print the correlation coefficient inside each cell.
  addCoef.col = "black",
  # Text label colour and rotation.
  tl.col = "black", tl.srt = 90,
  # Blank out cells whose p-value exceeds the significance level.
  p.mat = res1$p, sig.level = 0.05, insig = "blank",
  # Hide the principal diagonal.
  diag = FALSE
)

We focus on some variables where we observe a certain correlation. First, we observe the behaviour of commits against complexity.

library(ggplot2)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.3
## Loading required package: magrittr
# Rows of the "monica" project, all columns kept. The row filter is a plain
# logical vector because tibbles deprecate one-column matrix row subscripts.
monica <- dataset1[dataset1$project == "monica", ]
# Commits vs. complexity with a linear fit, zoomed to a fixed window.
sp <- ggplot(monica, aes(x = commits, y = complexity)) +
  geom_point(shape = 16, aes(colour = project)) +
  geom_smooth(
    method = lm, linetype = "dashed",
    color = "darkred", fill = "grey"
  ) +
  coord_cartesian(xlim = c(0, 255), ylim = c(1500, 4300))
# Annotate the plot with the Pearson correlation at a fixed label position.
sp + stat_cor(method = "pearson", label.x = 160, label.y = 4300)
## `geom_smooth()` using formula 'y ~ x'

# Rows of the "sonar-dotnet" project, all columns kept. The row filter is a
# plain logical vector because tibbles deprecate one-column matrix row
# subscripts.
sonar_dotnet <- dataset1[dataset1$project == "sonar-dotnet", ]
# Committers vs. duplicated blocks with a linear fit, zoomed to a fixed window.
sp <- ggplot(sonar_dotnet, aes(x = committers, y = duplicated_blocks)) +
  geom_point(shape = 16, aes(colour = project)) +
  geom_smooth(
    method = lm, linetype = "dashed",
    color = "darkred", fill = "grey"
  ) +
  coord_cartesian(xlim = c(0, 15), ylim = c(35, 90))
# Annotate the plot with the Pearson correlation at a fixed label position.
sp + stat_cor(method = "pearson", label.x = 8, label.y = 75)
## `geom_smooth()` using formula 'y ~ x'

# Rows of the "sonarqube" project, all columns kept. The row filter is a
# plain logical vector because tibbles deprecate one-column matrix row
# subscripts.
sonarqube <- dataset1[dataset1$project == "sonarqube", ]
# Committers vs. bugs with a linear fit, zoomed to a fixed window.
sp <- ggplot(sonarqube, aes(x = committers, y = bugs)) +
  geom_point(shape = 16, aes(colour = project)) +
  geom_smooth(
    method = lm, linetype = "dashed",
    color = "darkred", fill = "grey"
  ) +
  coord_cartesian(xlim = c(0, 31), ylim = c(48, 64))
# Annotate the plot with the Pearson correlation at a fixed label position.
sp + stat_cor(method = "pearson", label.x = 12, label.y = 58)
## `geom_smooth()` using formula 'y ~ x'

# Rows of the "jacoco" project, all columns kept. The row filter is a plain
# logical vector because tibbles deprecate one-column matrix row subscripts.
jacoco <- dataset1[dataset1$project == "jacoco", ]
# Committers vs. code smells with a linear fit (no zoom applied).
sp <- ggplot(jacoco, aes(x = committers, y = code_smells)) +
  geom_point(shape = 16, aes(colour = project)) +
  geom_smooth(
    method = lm, linetype = "dashed",
    color = "darkred", fill = "grey"
  )
# Annotate the plot with the Pearson correlation at a fixed label position.
sp + stat_cor(method = "pearson", label.x = 3, label.y = 230)
## `geom_smooth()` using formula 'y ~ x'

 # Commits vs. complexity for every observation, coloured by project.
 ggplot(dataset1, aes(x = commits, y = complexity)) +
   geom_point(aes(colour = project))

 # Same scatter plot with one linear fit per project and no confidence band.
 ggplot(dataset1, aes(x = commits, y = complexity, colour = project)) +
   geom_point(shape = 16) +
   geom_smooth(se = FALSE, method = lm)
## `geom_smooth()` using formula 'y ~ x'

# Commits vs. complexity with a single linear fit over all observations.
sp <- ggplot(dataset1, aes(x = commits, y = complexity)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm, linetype = "dashed",
    color = "darkred", fill = "grey"
  )
sp
## `geom_smooth()` using formula 'y ~ x'

# Same plot restricted to 0-500 commits and 0-20000 complexity.
zoom_sp <- sp + coord_cartesian(xlim = c(0, 500), ylim = c(0, 20000))
zoom_sp
## `geom_smooth()` using formula 'y ~ x'

# Base scatter plot of commits vs. complexity, coloured by project.
sp <- ggplot(dataset1, aes(x = commits, y = complexity)) +
  geom_point(aes(colour = project))

# Overlay 2D density contour lines.
sp + geom_density_2d()

# Overlay filled density polygons shaded from green (low) to red (high).
sp +
  stat_density_2d(aes(fill = ..level..), geom = "polygon") +
  scale_fill_gradient(low = "green", high = "red")

# Density of the number of committers, one translucent curve per project.
# The fill mapping is inherited from the plot-level aesthetics.
committers_density <- ggplot(dataset1, aes(x = committers, fill = project)) +
  geom_density(aes(group = project, colour = project), alpha = .1) +
  theme(legend.position = "right")
committers_density

# Same densities restricted to 0-10 committers and densities up to 0.75.
zoom_sp <- committers_density +
  coord_cartesian(xlim = c(0, 10), ylim = c(0, 0.75))
zoom_sp

# Density of the number of commits, one translucent curve per project.
# The fill mapping is inherited from the plot-level aesthetics.
commits_density <- ggplot(dataset1, aes(x = commits, fill = project)) +
  geom_density(aes(group = project, colour = project), alpha = .1) +
  theme(legend.position = "right")
commits_density

# Same densities restricted to 0-150 commits and densities up to 0.025.
zoom_sp <- commits_density +
  coord_cartesian(xlim = c(0, 150), ylim = c(0, 0.025))
zoom_sp

We carry out a hierarchical clustering with all the variables and take 4 clusters

# Pairwise distances between observations on the raw (unscaled) measures.
ddata1 <- dist(dataset_only_data)
# Agglomerative clustering with centroid linkage.
# NOTE(review): ?hclust states that centroid linkage is meant to be used with
# squared Euclidean distances, and the measures are on very different scales.
# Confirm that this setup is intended before interpreting the clusters.
gdata1 <- hclust(ddata1, method = "centroid")
plot(gdata1, sub = "example", xlab = "cases", ylab = "high")
# Outline the four clusters on the dendrogram.
rect.hclust(
  tree = gdata1, k = 4,
  border = c("red", "blue", "green", "orange")
)

# Cluster membership of every observation after cutting the tree into four.
clusters <- cutree(tree = gdata1, k = 4)
clusters
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [186] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 4 1 4 1 1 1 1 1
## [223] 1 1
# Copy of the numeric data with the hierarchical cluster label attached.
dataset_clusters <- mutate(dataset_only_data, cluster = factor(clusters))

# Commits vs. complexity coloured by hierarchical cluster, with one overall
# linear fit.
sp <- ggplot(dataset_clusters, aes(x = commits, y = complexity)) +
  geom_point(aes(colour = cluster), shape = 16) +
  geom_smooth(
    method = lm, linetype = "dashed",
    color = "darkred", fill = "grey"
  )

sp
## `geom_smooth()` using formula 'y ~ x'

## K-means scaled values

We carry out a K-means clustering with all the variables scaled and considering 4 clusters.

library(cluster)
## Warning: package 'cluster' was built under R version 3.6.3
# Method for determining the best number of clusters in K-means: look for a
# bend (elbow) in the within-groups sum of squares (SSE) scree plot.
# NOTE(review): the curve is computed on the unscaled data and without a
# seed, so it can vary between runs; confirm that this is intended.

mydata <- dataset_only_data
max_k <- 10L
# Preallocated so the vector is not grown inside the loop.
wss <- numeric(max_k)
# k = 1: total sum of squares around the column means.
wss[1] <- (nrow(mydata) - 1) * sum(vapply(mydata, var, numeric(1)))
# k >= 2: total within-cluster sum of squares of a K-means fit with k centers.
for (k in 2:max_k) {
  wss[k] <- kmeans(mydata, centers = k)$tot.withinss
}
plot(seq_len(max_k), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Centre and scale every measure before clustering.
zdata1 <- scale(dataset_only_data)
# K-means with four centers on the scaled data.
# NOTE(review): no seed is set before this call, so the partition and its
# label numbering can differ between runs.
kcdata1 <- kmeans(zdata1, centers = 4)
kcdata1$cluster
##   [1] 4 3 4 1 3 4 4 4 4 4 4 4 1 4 4 4 4 1 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
##  [38] 4 4 3 4 3 4 4 4 4 3 4 3 4 3 3 3 3 3 3 4 4 4 4 4 1 4 4 4 4 4 4 4 4 1 4 4 4
##  [75] 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3 3 4 4 4 3 4 4 4 4 1 1 1 1 1 1 1 1 1 4 4
## [112] 1 1 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 3 1 3 4 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 3
## [149] 1 3 4 4 4 4 4 4 4 4 4 4 4 1 4 4 4 4 4 3 1 3 4 3 3 4 4 3 3 4 4 3 3 4 4 1 1
## [186] 1 4 4 4 1 1 1 4 4 4 1 1 1 4 4 4 4 4 4 4 4 4 4 4 2 2 2 2 2 2 4 2 4 4 1 4 4
## [223] 4 1
# Attach the scaled K-means label as a factor column.
dataset_clusters[["cluster2"]] <- factor(kcdata1$cluster)


# Commits vs. complexity coloured by scaled K-means cluster. The points use
# a fixed shape and inherit the colour mapping from the plot.
sp <- ggplot(
  dataset_clusters,
  aes(x = commits, y = complexity, colour = cluster2, shape = cluster2)
) +
  geom_point(shape = 16) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")

sp
## `geom_smooth()` using formula 'y ~ x'

We performed the characterization of clusters for the k-means algorithm

# Use a single-panel layout for the base-graphics plots that follow.
par(mfrow = c(1, 1))
library(lattice)
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
# Scatter-plot matrix of the first nine columns, grouped by scaled K-means
# cluster.
# NOTE(review): 1:9 leaves out the tenth measure (open_issues); confirm that
# the omission is intended.
splom(
  ~ dataset_clusters[1:9],
  groups = cluster2,
  data = dataset_clusters,
  pch = 16
)

library(vioplot)
## Warning: package 'vioplot' was built under R version 3.6.3
## Loading required package: sm
## Warning: package 'sm' was built under R version 3.6.3
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Code smells of the observations in each scaled K-means cluster.
x1 <- with(dataset_clusters, code_smells[cluster2 == 1])
x2 <- with(dataset_clusters, code_smells[cluster2 == 2])
x3 <- with(dataset_clusters, code_smells[cluster2 == 3])
x4 <- with(dataset_clusters, code_smells[cluster2 == 4])

vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")

title("Code smells per cluster")

# Commits of the observations in each scaled K-means cluster.
x1 <- with(dataset_clusters, commits[cluster2 == 1])
x2 <- with(dataset_clusters, commits[cluster2 == 2])
x3 <- with(dataset_clusters, commits[cluster2 == 3])
x4 <- with(dataset_clusters, commits[cluster2 == 4])
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Commits per cluster")

We compute correlation and scatter plots for clusters

# Per-cluster subsets (scaled K-means labels) restricted to the ten numeric
# measures in columns 1:10. Rows are selected with a plain logical vector
# because tibbles deprecate one-column matrix row subscripts.
c1 <- dataset_clusters[dataset_clusters$cluster2 == "1", 1:10]
c2 <- dataset_clusters[dataset_clusters$cluster2 == "2", 1:10]
c3 <- dataset_clusters[dataset_clusters$cluster2 == "3", 1:10]
c4 <- dataset_clusters[dataset_clusters$cluster2 == "4", 1:10]

# Correlation matrix of each cluster, printed as numbers.
corrplot(cor(c1), method = "number")
## Warning in cor(c1): the standard deviation is zero

corrplot(cor(c2), method = "number")

corrplot(cor(c3), method = "number")

corrplot(cor(c4), method = "number")

# Palette generator running from red through white to green.
palette_stops <- c("#BB4444", "#EE9988", "#FFFFFF", "#AADD77", "#77AA44")
col <- colorRampPalette(palette_stops)

# Pairwise significance tests within cluster 1.
res1 <- cor.mtest(c1, conf.level = .95)
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
# Upper-triangle correlation plot for cluster 1: coefficients printed in each
# cell, cells with a p-value above 0.05 left blank, diagonal hidden.
corrplot(
  cor(c1),
  method = "color", col = col(200),
  type = "upper", order = "original", number.cex = .8,
  addCoef.col = "black", tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank",
  diag = FALSE
)
## Warning in cor(c1): the standard deviation is zero

# Same significance-filtered correlation plot for clusters 2, 3 and 4, drawn
# in that order. res1 ends up holding the test results of the last cluster.
for (cluster_data in list(c2, c3, c4)) {
  res1 <- cor.mtest(cluster_data, conf.level = .95)
  corrplot(
    cor(cluster_data),
    method = "color", col = col(200),
    type = "upper", order = "original", number.cex = .8,
    addCoef.col = "black", tl.col = "black", tl.srt = 90,
    p.mat = res1$p, sig.level = 0.05, insig = "blank",
    diag = FALSE
  )
}

# Commits vs. complexity with colour and point shape both mapped to the
# scaled K-means cluster, plus dashed linear fits.
sp <- ggplot(
  dataset_clusters,
  aes(x = commits, y = complexity, colour = cluster2, shape = cluster2)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'

Some 3d plots with correlations of several measures by cluster

## Plotting for sonarqube project, cluster 2 and 3 differences

Plotting for the sonarqube project: differences between clusters 2 and 3

library(ggplot2)
library(ggpubr)
# Make the minimal theme the default for the ggplot2 plots that follow.
theme_set(theme_minimal())

# Show the numeric data together with the cluster labels attached so far.
print(dataset_clusters)
## # A tibble: 224 x 12
##    commits changes_by_comm~ committers committers_weig~  bugs code_smells
##      <dbl>            <dbl>      <dbl>            <dbl> <dbl>       <dbl>
##  1     115            178.           8           0.0151    41         145
##  2     137            228.          17           0.0091    37          89
##  3      96            180.          13           0.0153    42          85
##  4       5             10.8          1           1         90         110
##  5     266            136.          21           0.0059    90         110
##  6      43            178.          10           0.0287    45         151
##  7      47            225.          10           0.0351    46         151
##  8     106           2097.          12           0.0132    46         151
##  9       8             93.5          3           0.221     46         127
## 10      16           1150.           3           0.147     46         127
## # ... with 214 more rows, and 6 more variables: complexity <dbl>,
## #   violations <dbl>, duplicated_blocks <dbl>, open_issues <dbl>,
## #   cluster <fct>, cluster2 <fct>
# Clustered data with the project/version identifiers re-attached.
# Note: this reuses the name `clusters`, replacing the cutree() label vector.
clusters <- dataset_clusters
clusters$project <- dataset1$project
clusters$version <- dataset1$version
# Rows of the sonarqube project, all columns kept. The row filter is a plain
# logical vector because tibbles deprecate one-column matrix row subscripts.
sonar <- clusters[clusters$project == "sonarqube", ]


# One line plot per measure across sonarqube versions; each point is coloured
# and shaped by its scaled K-means cluster. The `data` argument is spelled
# out in full instead of relying on partial argument matching.
# NOTE(review): version is read as a character column, so the x axis is
# presumably ordered alphabetically; confirm this matches release order.
p_bugs <- ggplot(data = sonar, aes(x = version, y = bugs)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster2, shape = cluster2, group = cluster2), size = 3)
p_bugs

p_violations <- ggplot(data = sonar, aes(x = version, y = violations)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster2, shape = cluster2, group = cluster2), size = 3)
p_violations

p_commits <- ggplot(data = sonar, aes(x = version, y = commits)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster2, shape = cluster2, group = cluster2), size = 3)

p_commits

# Stack the three plots vertically, labelled a, b and c.
figure <- ggarrange(
  p_bugs, p_violations, p_commits,
  labels = c("a", "b", "c"), ncol = 1, nrow = 3
)
figure

## K-means for normalized values

We perform the K-means algorithm with normalized values and Euclidean distance.

library(vegan)
## Warning: package 'vegan' was built under R version 3.6.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 3.6.3
## Registered S3 methods overwritten by 'vegan':
##   method         from      
##   reorder.hclust seriation 
##   rev.hclust     dendextend
## This is vegan 2.5-6
# Data normalization, followed by pairwise distances on the normalized rows.
spe.norm <- decostand(dataset_only_data, method = "normalize")
spe.ch <- vegdist(spe.norm, method = "euc")

# Hierarchical clustering with the "ward.D" linkage, and its dendrogram.
# NOTE(review): ?hclust notes that "ward.D" only implements Ward's criterion
# on squared distances ("ward.D2" squares them internally); confirm which
# variant is intended.
spe.ch.ward <- hclust(spe.ch, method = "ward.D")
plot(spe.ch.ward, sub = "Ward method")

# K-means partitions for 2 to 10 groups, compared with the "ssi" criterion.
spe.KM.cascade <- cascadeKM(
  spe.norm,
  inf.gr = 2, sup.gr = 10,
  iter = 400, criterion = "ssi"
)
spe.KM.cascade$results
##        2 groups   3 groups   4 groups   5 groups  6 groups   7 groups
## SSE 10.10758900 7.32699459 5.64852057 4.07213686 3.4075889 2.82913890
## ssi  0.03329901 0.03870528 0.03111807 0.01295001 0.0143994 0.01426338
##       8 groups   9 groups  10 groups
## SSE 2.29278134 1.88894115 1.54124155
## ssi 0.01326347 0.01817455 0.02032346
plot(spe.KM.cascade, sortg = TRUE)

# Compute k-means once, seeded so that the partition and its label numbering
# are reproducible. This fit is the one kept in spe.kmeans.
set.seed(1)
spe.kmeans <- kmeans(spe.norm, centers = 4, nstart = 100)

# Silhouette plot of that same fit, so the diagnostic describes the
# clustering actually kept. The cluster component is named in full rather
# than through partial matching of `$cl`.
dissE <- daisy(spe.norm)
sk <- silhouette(spe.kmeans$cluster, dissE)
plot(sk)

# Clusters plot.
# Cut the Ward dendrogram into four groups and cross-tabulate the K-means
# labels (rows) against those groups (columns).
spebc.ward.g <- cutree(spe.ch.ward, k = 4)
table(spe.kmeans$cluster, spebc.ward.g)
##    spebc.ward.g
##      1  2  3  4
##   1  0  5 37  0
##   2 91 66  0  0
##   3  0 10  0  2
##   4  0  0  0 13
clusplot(
  spe.norm, spe.kmeans$cluster,
  color = TRUE, shade = TRUE,
  labels = 2, lines = 0
)

# Attach the normalized K-means label as a factor column.
dataset_clusters[["cluster3"]] <- factor(spe.kmeans$cluster)

We performed the characterization of clusters for the k-means algorithm

# Use a single-panel layout for the base-graphics plots that follow.
par(mfrow = c(1, 1))
library(lattice)
# Scatter-plot matrix of the first nine columns, grouped by normalized
# K-means cluster.
# NOTE(review): 1:9 leaves out the tenth measure (open_issues); confirm that
# the omission is intended.
splom(
  ~ dataset_clusters[1:9],
  groups = cluster3,
  data = dataset_clusters,
  pch = 16
)

library(vioplot)

# Code smells of the observations in each normalized K-means cluster.
x1 <- with(dataset_clusters, code_smells[cluster3 == 1])
x2 <- with(dataset_clusters, code_smells[cluster3 == 2])
x3 <- with(dataset_clusters, code_smells[cluster3 == 3])
x4 <- with(dataset_clusters, code_smells[cluster3 == 4])

vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")

title("Code smells per cluster")

# Commits of the observations in each normalized K-means cluster.
x1 <- with(dataset_clusters, commits[cluster3 == 1])
x2 <- with(dataset_clusters, commits[cluster3 == 2])
x3 <- with(dataset_clusters, commits[cluster3 == 3])
x4 <- with(dataset_clusters, commits[cluster3 == 4])
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Commits per cluster")

We compute correlation and scatter plots for clusters

# Per-cluster subsets (normalized K-means labels) restricted to the ten
# numeric measures in columns 1:10. Rows are selected with a plain logical
# vector because tibbles deprecate one-column matrix row subscripts.
c1 <- dataset_clusters[dataset_clusters$cluster3 == "1", 1:10]
c2 <- dataset_clusters[dataset_clusters$cluster3 == "2", 1:10]
c3 <- dataset_clusters[dataset_clusters$cluster3 == "3", 1:10]
c4 <- dataset_clusters[dataset_clusters$cluster3 == "4", 1:10]

# Correlation matrix of each cluster, printed as numbers.
corrplot(cor(c1), method = "number")

corrplot(cor(c2), method = "number")

corrplot(cor(c3), method = "number")

corrplot(cor(c4), method = "number")

# Palette generator running from red through white to green.
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#AADD77", "#77AA44"))

# For each cluster in turn (1 to 4): test every pairwise correlation, then
# draw the upper-triangle correlation plot with coefficients printed and
# cells with a p-value above 0.05 left blank. res1 ends up holding the test
# results of the last cluster.
for (cluster_data in list(c1, c2, c3, c4)) {
  res1 <- cor.mtest(cluster_data, conf.level = .95)
  corrplot(
    cor(cluster_data),
    method = "color", col = col(200),
    type = "upper", order = "original", number.cex = .8,
    addCoef.col = "black", tl.col = "black", tl.srt = 90,
    p.mat = res1$p, sig.level = 0.05, insig = "blank",
    diag = FALSE
  )
}

# Commits vs. complexity with colour and point shape both mapped to the
# normalized K-means cluster, plus dashed linear fits.
sp <- ggplot(
  dataset_clusters,
  aes(x = commits, y = complexity, colour = cluster3, shape = cluster3)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'

Some 3d plots with correlations of several measures by cluster